
# Source Generated with Decompyle++
# File: in.pyc (Python 2.4)

from __future__ import generators

import math
import re
import os
import sys
import socket
import pickle
import urllib2
from email import message_from_string

try:
    Set = set
except NameError:
    try:
        from sets import Set
    except ImportError:
        from spambayes.compatsets import Set

try:
    enumerate
except NameError:
    def enumerate(seq):
        i = 0
        for elt in seq:
            yield (i, elt)
            i += 1

DOMAIN_AND_PORT_RE = re.compile(r'([^:/\\]+)(:([\d]+))?')
HTTP_ERROR_RE = re.compile(r'HTTP Error ([\d]+)')
URL_KEY_RE = re.compile(r'[\W]')
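
# For example (hypothetical URL, not taken from this module): matching
# 'example.com:8080/index.html' against DOMAIN_AND_PORT_RE gives
# group(1) == 'example.com' and group(3) == '8080'; when no port is
# present, group(3) is None and slurp() falls back to port 80.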

from spambayes.Options import options
from spambayes.chi2 import chi2Q

try:
    (True, False)
except NameError:
    # Maintain compatibility with Python versions that lack a bool type.
    (True, False) = (1, 0)

LN2 = math.log(2)       # used frequently by chi-combining

slurp_wordstream = None

PICKLE_VERSION = 5


class WordInfo(object):
    # Per-token record: spamcount is the number of trained spam messages
    # the token appeared in, hamcount the number of trained ham messages.
    # __slots__ keeps these records small, since there is one per token.
    __slots__ = ('spamcount', 'hamcount')

    def __init__(self):
        self.__setstate__((0, 0))

    def __repr__(self):
        return 'WordInfo' + repr((self.spamcount, self.hamcount))

    def __getstate__(self):
        return (self.spamcount, self.hamcount)

    def __setstate__(self, t):
        (self.spamcount, self.hamcount) = t
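
# With the default (protocol 0) pickle, objects that define __slots__ and no
# __dict__ need the explicit __getstate__/__setstate__ pair above to be
# picklable; a fresh WordInfo round-trips (hypothetically, via
# pickle.dumps/pickle.loads) as the bare tuple (0, 0).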


class Classifier:
    WordInfoClass = WordInfo

    def __init__(self):
        self.wordinfo = {}
        self.probcache = {}
        self.nspam = self.nham = 0

    def __getstate__(self):
        return (PICKLE_VERSION, self.wordinfo, self.nspam, self.nham)

    def __setstate__(self, t):
        if t[0] != PICKLE_VERSION:
            raise ValueError("Can't unpickle -- version %s unknown" % t[0])
        (self.wordinfo, self.nspam, self.nham) = t[1:]
        self.probcache = {}

    def chi2_spamprob(self, wordstream, evidence=False):
        '''Return best-guess probability that wordstream is spam.

        wordstream is an iterable object producing words.
        The return value is a float in [0.0, 1.0].

        If optional arg evidence is True, the return value is a pair
            probability, evidence
        where evidence is a list of (word, probability) pairs.
        '''
        from math import frexp, log as ln

        # We compute two chi-squared statistics, one for ham and one for
        # spam.  To avoid underflowing the running products of probabilities,
        # each is kept as a mantissa (S, H) plus a base-2 exponent
        # (Sexp, Hexp) via frexp().
        H = S = 1.0
        Hexp = Sexp = 0
        clues = self._getclues(wordstream)
        for prob, word, record in clues:
            S *= 1.0 - prob
            H *= prob
            if S < 1e-200:
                (S, e) = frexp(S)
                Sexp += e
            if H < 1e-200:
                (H, e) = frexp(H)
                Hexp += e

        # sum-of-logs == log-of-product:  ln(x * 2**i) = ln(x) + i * ln(2).
        S = ln(S) + Sexp * LN2
        H = ln(H) + Hexp * LN2

        n = len(clues)
        if n:
            S = 1.0 - chi2Q(-2.0 * S, 2 * n)
            H = 1.0 - chi2Q(-2.0 * H, 2 * n)
            # Combine the two indicators into a single score in [0.0, 1.0].
            prob = ((S - H) + 1.0) / 2.0
        else:
            prob = 0.5

        if evidence:
            clues = [(w, p) for (p, w, _r) in clues]
            clues.sort(lambda a, b: cmp(a[1], b[1]))
            clues.insert(0, ('*S*', S))
            clues.insert(1, ('*H*', H))
            return (prob, clues)
        return prob
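
    # Rough usage sketch (hypothetical classifier instance and token list,
    # not part of this module):
    #
    #     prob, clues = bayes.chi2_spamprob(tokens, evidence=True)
    #
    # prob is the combined score in [0.0, 1.0]; clues lists the individual
    # (word, probability) pairs, with the chi-squared components prepended
    # as the pseudo-words '*S*' and '*H*'.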

    def slurping_spamprob(self, wordstream, evidence=False):
        '''Do the standard chi-squared spamprob, but if the evidence
        leaves the score in the unsure range, and we have fewer tokens
        than max_discriminators, also generate tokens from the text
        obtained by following http URLs in the message.'''
        h_cut = options[('Categorization', 'ham_cutoff')]
        s_cut = options[('Categorization', 'spam_cutoff')]

        # Get the raw score.
        (prob, clues) = self.chi2_spamprob(wordstream, True)

        # If the message is unsure and there is still room for more
        # discriminators, rescore with the tokens slurped from whatever
        # the message's URLs point at.
        if len(clues) < options[('Classifier', 'max_discriminators')] and \
           h_cut < prob < s_cut and slurp_wordstream:
            slurp_tokens = list(self._generate_slurp())
            slurp_tokens.extend([w for (w, p) in clues])
            (sprob, sclues) = self.chi2_spamprob(slurp_tokens, True)
            if sprob < h_cut or sprob > s_cut:
                (prob, clues) = (sprob, sclues)

        if evidence:
            return (prob, clues)
        return prob

    if options[('Classifier', 'use_chi_squared_combining')]:
        if options[('URLRetriever', 'x-slurp_urls')]:
            spamprob = slurping_spamprob
        else:
            spamprob = chi2_spamprob

    def learn(self, wordstream, is_spam):
        """Teach the classifier by example.

        wordstream is a word stream representing a message.  If is_spam is
        True, you're telling the classifier this message is definitely spam,
        else that it's definitely not spam.
        """
        if options[('Classifier', 'use_bigrams')]:
            wordstream = self._enhance_wordstream(wordstream)
        if options[('URLRetriever', 'x-slurp_urls')]:
            wordstream = self._add_slurped(wordstream)
        self._add_msg(wordstream, is_spam)

    def unlearn(self, wordstream, is_spam):
        '''In case of pilot error, call unlearn ASAP after screwing up.

        Pass the same arguments you passed to learn().
        '''
        if options[('Classifier', 'use_bigrams')]:
            wordstream = self._enhance_wordstream(wordstream)
        if options[('URLRetriever', 'x-slurp_urls')]:
            wordstream = self._add_slurped(wordstream)
        self._remove_msg(wordstream, is_spam)
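
    # Typical round trip (hypothetical wordstream; the tokenizer lives in
    # spambayes.tokenizer, not here):
    #
    #     bayes.learn(tokens, True)      # train the message as spam
    #     bayes.unlearn(tokens, True)    # undo it with the same arguments
    #
    # Because unlearn() re-applies the same bigram/slurp transformations,
    # the per-token counts return to their previous values, assuming the
    # stream produces the same tokens both times.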

    def probability(self, record):
        '''Compute, store, and return prob(msg is spam | msg contains word).

        This is the Graham calculation, but stripped of biases, and
        stripped of clamping into 0.01 thru 0.99.  The Bayesian
        adjustment following keeps them in a sane range, and one
        that naturally grows the more evidence there is to back up
        a probability.
        '''
        spamcount = record.spamcount
        hamcount = record.hamcount

        # Try the cache first.
        try:
            return self.probcache[spamcount][hamcount]
        except KeyError:
            pass

        nham = float(self.nham or 1)
        nspam = float(self.nspam or 1)

        assert hamcount <= nham, 'Token seen in more ham than ham trained.'
        hamratio = hamcount / nham

        assert spamcount <= nspam, 'Token seen in more spam than spam trained.'
        spamratio = spamcount / nspam

        prob = spamratio / (hamratio + spamratio)

        S = options[('Classifier', 'unknown_word_strength')]
        StimesX = S * options[('Classifier', 'unknown_word_prob')]

        # Bayesian adjustment toward unknown_word_prob, weighted by the
        # amount of evidence (n = total number of times the token was seen).
        n = hamcount + spamcount
        prob = (StimesX + n * prob) / (S + n)

        # Update the cache.
        try:
            self.probcache[spamcount][hamcount] = prob
        except KeyError:
            self.probcache[spamcount] = {hamcount: prob}

        return prob
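
    # Worked example of the adjustment above (illustrative numbers only,
    # assuming the default unknown_word_strength S = 0.45 and
    # unknown_word_prob x = 0.5): a token seen in 3 of 10 spam and 1 of 10
    # ham gives the raw ratio prob = 0.3 / (0.1 + 0.3) = 0.75; with n = 4
    # observations the smoothed value is
    # (0.45*0.5 + 4*0.75) / (0.45 + 4) ~= 0.72, so lightly-seen tokens are
    # pulled toward 0.5 while heavily-seen ones keep their raw ratio.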

    def _add_msg(self, wordstream, is_spam):
        self.probcache = {}    # nuke the prob cache
        if is_spam:
            self.nspam += 1
        else:
            self.nham += 1

        for word in Set(wordstream):
            record = self._wordinfoget(word)
            if record is None:
                record = self.WordInfoClass()

            if is_spam:
                record.spamcount += 1
            else:
                record.hamcount += 1

            self._wordinfoset(word, record)

        self._post_training()

    def _remove_msg(self, wordstream, is_spam):
        self.probcache = {}    # nuke the prob cache
        if is_spam:
            if self.nspam <= 0:
                raise ValueError('spam count would go negative!')
            self.nspam -= 1
        else:
            if self.nham <= 0:
                raise ValueError('non-spam count would go negative!')
            self.nham -= 1

        for word in Set(wordstream):
            record = self._wordinfoget(word)
            if record is not None:
                if is_spam:
                    if record.spamcount > 0:
                        record.spamcount -= 1
                else:
                    if record.hamcount > 0:
                        record.hamcount -= 1
                if record.hamcount == 0 == record.spamcount:
                    self._wordinfodel(word)
                else:
                    self._wordinfoset(word, record)

        self._post_training()

    def _post_training(self):
        '''This is called after training on a wordstream.  Subclasses might
        want to ensure that their databases are in a consistent state at
        this point.  Introduced to fix bug #797890.'''
        pass

    def _getclues(self, wordstream):
        mindist = options[('Classifier', 'minimum_prob_strength')]

        if options[('Classifier', 'use_bigrams')]:
            # This scheme mixes single tokens with pairs of adjacent tokens,
            # "tiling" wordstream into non-overlapping unigrams and bigrams
            # so that no original token contributes to more than one clue.
            # raw holds ((distance, prob, word, record), indices) pairs,
            # where indices are the 0-based positions of the tokens that
            # went into the clue (a 1-tuple for a unigram, a 2-tuple for a
            # synthesized bigram).
            raw = []
            push = raw.append
            pair = None
            # Track which tokens we've already scored; seeding with the
            # None pair skips the bigram slot on the first trip.
            seen = {pair: 1}
            for i, token in enumerate(wordstream):
                if i:   # not the first token, so there is a preceding one
                    # This string interpolation must match the one in
                    # _enhance_wordstream().
                    pair = 'bi:%s %s' % (last_token, token)
                last_token = token
                for clue, indices in ((token, (i,)), (pair, (i - 1, i))):
                    if clue not in seen:    # as always, skip duplicates
                        seen[clue] = 1
                        tup = self._worddistanceget(clue)
                        if tup[0] >= mindist:
                            push((tup, indices))

            # Sort raw strongest-to-weakest, then keep the strongest
            # non-overlapping clues.
            raw.sort()
            raw.reverse()
            clues = []
            push = clues.append
            seen = {}   # indices that already back a kept clue
            for tup, indices in raw:
                overlap = [i for i in indices if i in seen]
                if not overlap:     # no overlap with anything already kept
                    for i in indices:
                        seen[i] = 1
                    push(tup)
            # Leave sorted from smallest to largest spamprob.
            clues.reverse()
        else:
            # The all-unigram scheme just scores the tokens as-is.  A Set()
            # weeds out duplicates at high speed.
            clues = []
            push = clues.append
            for word in Set(wordstream):
                tup = self._worddistanceget(word)
                if tup[0] >= mindist:
                    push(tup)
            clues.sort()

        if len(clues) > options[('Classifier', 'max_discriminators')]:
            del clues[0:-options[('Classifier', 'max_discriminators')]]
        # Return (prob, word, record) triples.
        return [t[1:] for t in clues]
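
    # Tiling sketch (hypothetical three-token stream): for ['a', 'b', 'c']
    # the candidate clues are 'a', 'b', 'c', 'bi:a b' and 'bi:b c'; after
    # the strength sort, 'bi:a b' is only kept if neither position 0 nor
    # position 1 already backs a stronger clue.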

    def _worddistanceget(self, word):
        record = self._wordinfoget(word)
        if record is None:
            prob = options[('Classifier', 'unknown_word_prob')]
        else:
            prob = self.probability(record)
        distance = abs(prob - 0.5)
        return (distance, prob, word, record)

    def _wordinfoget(self, word):
        return self.wordinfo.get(word)

    def _wordinfoset(self, word, record):
        self.wordinfo[word] = record

    def _wordinfodel(self, word):
        del self.wordinfo[word]

    def _enhance_wordstream(self, wordstream):
        '''Add bigrams to the wordstream.

        For example, a b c -> a b "a b" c "b c"

        Note that these are *token* bigrams, and not *word* bigrams - i.e.
        'synthetic' tokens get bigram'ed, too.

        The bigram token is simply "bi:unigram1 unigram2" - a space should
        be sufficient as a separator, since spaces aren't in any other
        tokens, apart from 'synthetic' ones.  The "bi:" prefix is added
        to avoid conflict with tokens we generate (like "subject: word",
        which could be "word" in a subject, or a bigram of "subject:" and
        "word").

        If the "Classifier":"use_bigrams" option is removed, this function
        can be removed, too.
        '''
        last = None
        for token in wordstream:
            yield token
            if last:
                yield 'bi:%s %s' % (last, token)
            last = token
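
    # For example (hypothetical call, assuming a plain list stands in for
    # the tokenizer's output):
    #
    #     list(bayes._enhance_wordstream(['a', 'b', 'c']))
    #     -> ['a', 'b', 'bi:a b', 'c', 'bi:b c']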

    def _generate_slurp(self):
        if not hasattr(self, 'setup_done'):
            self.setup()
            self.setup_done = True
        # do_slurp guards against recursively slurping the URLs found on
        # pages we are already in the middle of slurping.
        if not hasattr(self, 'do_slurp') or self.do_slurp:
            if slurp_wordstream:
                self.do_slurp = False
                tokens = self.slurp(*slurp_wordstream)
                self.do_slurp = True
                self._save_caches()
                return tokens
        return []

    def setup(self):
        # Imported here rather than at module level to avoid a circular
        # import.
        from spambayes.FileCorpus import ExpiryFileCorpus, FileMessageFactory

        username = options[('globals', 'proxy_username')]
        password = options[('globals', 'proxy_password')]
        server = options[('globals', 'proxy_server')]
        if server.find(':') != -1:
            (server, port) = server.split(':', 1)
        else:
            port = 8080
        if server:
            # Build a new opener that uses a proxy requiring authorization.
            proxy_support = urllib2.ProxyHandler({
                'http': 'http://%s:%s@%s:%s' % (username, password, server, port) })
            opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
        else:
            # Build a new opener without any proxy information.
            opener = urllib2.build_opener(urllib2.HTTPHandler)
        urllib2.install_opener(opener)

        # Set up the cache for retrieved urls.
        age = options[('URLRetriever', 'x-cache_expiry_days')] * 24 * 60 * 60
        dir = options[('URLRetriever', 'x-cache_directory')]
        if not os.path.exists(dir):
            if options[('globals', 'verbose')]:
                print >>sys.stderr, 'Creating URL cache directory'
            os.makedirs(dir)

        self.urlCorpus = ExpiryFileCorpus(age, FileMessageFactory(), dir, cacheSize=20)
        # Kill any expired information in the cache.
        self.urlCorpus.removeExpiredMessages()

        # Set up caches for unretrievable urls.
        self.bad_url_cache_name = os.path.join(dir, 'bad_urls.pck')
        self.http_error_cache_name = os.path.join(dir, 'http_error_urls.pck')
        if os.path.exists(self.bad_url_cache_name):
            b_file = file(self.bad_url_cache_name, 'r')
            try:
                self.bad_urls = pickle.load(b_file)
            except (IOError, ValueError):
                # Something went wrong loading it (bad pickle, probably).
                # Start afresh.
                if options[('globals', 'verbose')]:
                    print >>sys.stderr, 'Bad URL pickle, using new.'
                self.bad_urls = {
                    'url:non_resolving': (),
                    'url:non_html': (),
                    'url:unknown_error': () }
            b_file.close()
        else:
            if options[('globals', 'verbose')]:
                print "URL caches don't exist: creating"
            self.bad_urls = {
                'url:non_resolving': (),
                'url:non_html': (),
                'url:unknown_error': () }
        if os.path.exists(self.http_error_cache_name):
            h_file = file(self.http_error_cache_name, 'r')
            try:
                self.http_error_urls = pickle.load(h_file)
            except (IOError, ValueError):
                # Something went wrong loading it (bad pickle, probably).
                # Start afresh.
                if options[('globals', 'verbose')]:
                    print >>sys.stderr, 'Bad HTTP error pickle, using new.'
                self.http_error_urls = {}
            h_file.close()
        else:
            self.http_error_urls = {}

    def _save_caches(self):
        for name, data in [
            (self.bad_url_cache_name, self.bad_urls),
            (self.http_error_cache_name, self.http_error_urls)]:
            # Save to a temporary file first, in case something goes wrong.
            cache = open(name + '.tmp', 'w')
            pickle.dump(data, cache)
            cache.close()
            try:
                os.rename(name + '.tmp', name)
            except OSError:
                # win32 can't rename onto an existing file, so remove the
                # old cache first.
                os.remove(name)
                os.rename(name + '.tmp', name)
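
    # The write-to-.tmp-then-rename dance above means a crash mid-dump
    # leaves the previous cache file intact; only a fully written temp
    # file ever replaces it.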

    def slurp(self, proto, url):
        # Tokenize the page at the given URL, caching failures as special
        # url:* tokens.
        if not url:
            return ['url:non_resolving']

        from spambayes.tokenizer import Tokenizer

        if options[('URLRetriever', 'x-only_slurp_base')]:
            url = self._base_url(url)

        # Check the unretrievable caches.
        for err in self.bad_urls.keys():
            if url in self.bad_urls[err]:
                return [err]
        if self.http_error_urls.has_key(url):
            return self.http_error_urls[url]

        # Check that the url will resolve first.
        mo = DOMAIN_AND_PORT_RE.match(url)
        domain = mo.group(1)
        if mo.group(3) is None:
            port = 80
        else:
            port = mo.group(3)
        try:
            not_used = socket.getaddrinfo(domain, port)
        except socket.error:
            self.bad_urls['url:non_resolving'] += (url,)
            return ['url:non_resolving']

        # If the page is in our cache, skip the network and use that copy.
        url_key = URL_KEY_RE.sub('_', url)
        cached_message = self.urlCorpus.get(url_key)
        if cached_message is None:
            # We're going to ignore everything that isn't text/html, so
            # don't bother retrieving anything with these extensions.
            parts = url.split('.')
            if parts[-1] in ('jpg', 'gif', 'png', 'css', 'js'):
                self.bad_urls['url:non_html'] += (url,)
                return ['url:non_html']

            # Waiting for the default timeout period slows everything down
            # far too much, so try to reduce it for just this call (this
            # will only work with Python 2.3 and above).
            try:
                timeout = socket.getdefaulttimeout()
                socket.setdefaulttimeout(5)
            except AttributeError:
                pass

            try:
                if options[('globals', 'verbose')]:
                    print >>sys.stderr, 'Slurping', url
                f = urllib2.urlopen('%s://%s' % (proto, url))
            except (urllib2.URLError, socket.error), details:
                mo = HTTP_ERROR_RE.match(str(details))
                if mo:
                    self.http_error_urls[url] = 'url:http_' + mo.group(1)
                    return ['url:http_' + mo.group(1)]
                self.bad_urls['url:unknown_error'] += (url,)
                return ['url:unknown_error']

            # Restore the timeout.
            try:
                socket.setdefaulttimeout(timeout)
            except AttributeError:
                pass

            try:
                # Anything that isn't text/html is ignored.
                content_type = f.info().get('content-type')
                if content_type is None or not content_type.startswith('text/html'):
                    self.bad_urls['url:non_html'] += (url,)
                    return ['url:non_html']

                page = f.read()
                headers = str(f.info())
                f.close()
            except socket.error:
                # Probably a temporary error, like a timeout; just bail out.
                return []

            fake_message_string = headers + '\r\n' + page

            # Retrieving the same page over and over would tire us out,
            # so store it in our own little cache.
            message = self.urlCorpus.makeMessage(url_key, fake_message_string)
            self.urlCorpus.addMessage(message)
        else:
            fake_message_string = cached_message.as_string()

        msg = message_from_string(fake_message_string)

        # We don't want to do full header tokenising, as that is optimised
        # for messages, not webpages, so just do the basic stuff.
        bht = options[('Tokenizer', 'basic_header_tokenize')]
        bhto = options[('Tokenizer', 'basic_header_tokenize_only')]
        options[('Tokenizer', 'basic_header_tokenize')] = True
        options[('Tokenizer', 'basic_header_tokenize_only')] = True

        tokens = Tokenizer().tokenize(msg)
        pf = options[('URLRetriever', 'x-web_prefix')]
        tokens = ['%s%s' % (pf, tok) for tok in tokens]

        # Undo the option changes.
        options[('Tokenizer', 'basic_header_tokenize')] = bht
        options[('Tokenizer', 'basic_header_tokenize_only')] = bhto
        return tokens
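
    # The special tokens produced above are 'url:non_resolving',
    # 'url:non_html', 'url:unknown_error' and 'url:http_NNN' (one per HTTP
    # error code seen); anything else is the tokenized page content with
    # the configured x-web_prefix prepended.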

    def _base_url(self, url):
        url += '/'
        (domain, garbage) = url.split('/', 1)
        parts = domain.split('.')
        if len(parts) > 2:
            base_domain = parts[-2] + '.' + parts[-1]
            if len(parts[-1]) < 3:
                base_domain = parts[-3] + '.' + base_domain
        else:
            base_domain = domain
        return base_domain
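
    # Illustrative behaviour (hypothetical inputs): 'www.example.com/buy'
    # reduces to 'example.com', while 'shop.example.co.uk/x' keeps three
    # components ('example.co.uk') because the final part is shorter than
    # three characters.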

    def _add_slurped(self, wordstream):
        """Add tokens generated by 'slurping' (i.e. tokenizing
        the text at the web pages pointed to by URLs in messages)
        to the wordstream."""
        for token in wordstream:
            yield token
        slurped_tokens = self._generate_slurp()
        for token in slurped_tokens:
            yield token

    def _wordinfokeys(self):
        return self.wordinfo.keys()


Bayes = Classifier